
## A Research Note on the Prevalence
## of Housing Eviction among Children
## Born in American Cities

## CODE PART D:
## Alternative specification: Random forest

## Ian Lundberg and Louis Donnelly

## Department of Sociology, Office of Population Research,
## and Center for Research on Child Wellbeing
## Princeton University

## Code by Ian Lundberg (ilundberg@princeton.edu)
## Last updated 10 May 2018

## Set working directory
setwd("C:/Users/iandl/Documents/FF Eviction Prevalence")

## Load files from prior code chunks
load("Intermediate/d.Rdata")
load("Intermediate/scale.factors.Rdata")
load("Intermediate/scaled.long.Rdata")
load("Intermediate/means.Rdata")
load("Intermediate/modelForms.Rdata")
load("Intermediate/newData.Rdata")
load("Intermediate/IDs.Rdata")
load("Intermediate/specification_numbers.Rdata")

## Load required packages
library(tidyverse)
library(reshape2)
library(doParallel)
library(doRNG)
library(ranger) # See Wright & Ziegler 2017. This is a random forest package designed to be fast with high-dimensional data.


#############################
## APPENDIX: RANDOM FOREST ##
#############################

## We only consider the final model formula
modelForm <- modelForms[[length(modelForms)]]

## Prep the data for the random forest
preForRF <- scaled.long %>%
  left_join(d %>% select(idnum, m1natwt),
            by = "idnum") %>%
  filter(imp == 1 & !is.na(ev) & !is.na(m1natwt)) %>%
  arrange(m1city, -m1natwt, idnum)
X <- model.matrix(modelForm,
                  data = preForRF)
cities <- model.matrix(~ -1 + factor(m1city),
                       data = preForRF)[,-1]
forRF <- data.frame(ev = preForRF$ev,
                    m1natwt = preForRF$m1natwt,
                    idnum = preForRF$idnum,
                    X,cities) %>%
  arrange(ev, preForRF$m1city, -preForRF$m1natwt, preForRF$idnum)

preForRFTest_preExpansion <- scaled.long %>%
  left_join(d %>% select(idnum, m1natwt),
            by = "idnum") %>%
  filter(imp == 1) %>%
  filter(!is.na(m1natwt)) %>%
  group_by(idnum) %>%
  filter(1:n() == 1) %>%
  mutate(ev = NA) %>%
  arrange(m1city, -m1natwt, idnum)

preForRFTest <- preForRFTest_preExpansion[rep(1:nrow(preForRFTest_preExpansion), each = 15),] %>%
  group_by(idnum) %>%
  mutate(age = 1:n()) %>%
  mutate(recession = ifelse(age >= 8 & age <= 9,1,0) - means["recession"],
         age = (age - means["age"]) / scale.factors["age"]) %>%
  group_by()
X <- model.matrix(modelForm,
                  data = preForRFTest)
cities <- model.matrix(~ -1 + factor(m1city),
                       data = preForRFTest)[,-1]
forRFTest <- data.frame(ev = preForRFTest$ev,
                        m1natwt = preForRFTest$m1natwt,
                        idnum = preForRFTest$idnum,
                        X,cities)

if(!all(colnames(forRFTest) == colnames(forRF))) {
  stop("ERROR: Training and test column names do not match")
}

## Draw fold assignments
## such that every 5 observations are
## assigned to a random order of the folds 1-5
set.seed(08544)
folds <- as.vector(
  replicate(ceiling(nrow(forRF) / 5), sample(1:5))
)[1:nrow(forRF)]

## Set the candidate mtry values
candidates <- 1:4

cl <- makeCluster(5, outfile = "")
registerDoParallel(cl)
set.seed(08544)
## Choose an estimator
rf_cv <- foreach (
  i = 1:(length(candidates)*length(unique(folds))),
  .packages = c("tidyverse","ranger","foreach"),
  .combine = "rbind"
) %dorng% {
  candidate <- candidates[ceiling(i / length(unique(folds)))]
  fold <- rep(1:length(unique(folds)), length(candidates))[i]
  rangerFit_withID <- ranger(formula = ev ~ .,
                             data = forRF[folds != fold,] %>%
                               select(-m1natwt),
                             num.trees = 500,
                             mtry = candidate,
                             min.node.size = 5,
                             respect.unordered.factors = "order",
                             case.weights = forRF$m1natwt[forRF$folds != fold])
  rangerFit_withoutID <- ranger(formula = ev ~ .,
                                data = forRF[folds != fold,] %>%
                                  select(-m1natwt,-idnum),
                                num.trees = 500,
                                mtry = candidate,
                                min.node.size = 5,
                                respect.unordered.factors = "order",
                                case.weights = forRF$m1natwt[forRF$folds != fold])
  cvPred <-  data.frame(
    candidate = candidate,
    fold = fold,
    weight = forRF[folds == fold,"m1natwt"],
    phat_withID = predict(rangerFit_withID,
                          data = forRF[folds == fold,] %>%
                            select(-m1natwt))$predictions,
    phat_withoutID = predict(rangerFit_withoutID,
                             data = forRF[folds == fold,] %>%
                               select(-m1natwt))$predictions,
    y = forRF$ev[folds == fold]
  )
  return(cvPred)
}
stopCluster(cl)

#########################################
## Figure A9a.                         ##
## Plot the cross validation results   ##
## for the random forest               ##
#########################################

rf_cv %>%
  select(-fold) %>%
  melt(id = c("candidate","y","weight"),
       variable.name = "model",
       value.name = "phat") %>%
  group_by(candidate, model) %>%
  summarize(mse.weighted = weighted.mean((y - phat) ^ 2,
                                         w = weight),
            mse.unweighted = mean((y - phat) ^ 2)) %>%
  mutate(`Respondent ID` = ifelse(model == "phat_withID",
                                  "included as a predictor",
                                  "not included")) %>%
  ggplot(aes(x = candidate, y = mse.weighted,
             color = `Respondent ID`)) +
  geom_line() + 
  geom_point() +
  xlab("Number of variables sampled for\npossible splitting at each node") +
  ylab("Mean squared error\n(10-fold cross validation, weighted)") +
  theme(legend.position = "none") +
  ggtitle("A. Cross-validated prediction error") +
  ggsave("Output/rfCVWeighted.pdf",
         height = 3, width = 5)

## Fit each option on the full data
cl <- makeCluster(length(candidates), outfile = "")
registerDoParallel(cl)
set.seed(08544)
rf_fit <- foreach (
  candidate = 1:length(candidates),
  .packages = c("tidyverse","ranger","foreach"),
  .combine = "rbind"
) %dorng% {
  fullFit.withID <- ranger(formula = ev ~ .,
                           data = forRF %>%
                             select(-m1natwt),
                           num.trees = 500,
                           mtry = candidate,
                           min.node.size = 5,
                           respect.unordered.factors = "order",
                           case.weights = forRF$m1natwt)
  
  fullFit.withoutID <- ranger(formula = ev ~ .,
                              data = forRF %>%
                                select(-m1natwt,-idnum),
                              num.trees = 500,
                              mtry = candidate,
                              min.node.size = 5,
                              respect.unordered.factors = "order",
                              case.weights = forRF$m1natwt)
  
  estimate.withID <- forRFTest %>%
    select(idnum,m1natwt) %>%
    group_by() %>%
    mutate(phat = predict(fullFit.withID,
                          data = forRFTest %>% 
                            select(-m1natwt))$predictions) %>%
    group_by(idnum) %>%
    summarize(cumProb = 1 - prod(1 - phat),
              m1natwt = mean(m1natwt)) %>%
    group_by() %>%
    summarize(cumProb = weighted.mean(cumProb, w = m1natwt))
  
  estimate.withoutID <- forRFTest %>%
    select(idnum,m1natwt) %>%
    group_by() %>%
    mutate(phat = predict(fullFit.withoutID,
                          data = forRFTest %>% 
                            select(-m1natwt,-idnum))$predictions) %>%
    group_by(idnum) %>%
    summarize(cumProb = 1 - prod(1 - phat),
              m1natwt = mean(m1natwt)) %>%
    group_by() %>%
    summarize(cumProb = weighted.mean(cumProb, w = m1natwt))
  
  return(data.frame(
    candidate = candidate,
    model = c("withID","withoutID"),
    estimate = c(estimate.withID$cumProb, estimate.withoutID$cumProb)
  ))
}
stopCluster(cl)

###########################################
## Figure A9b.                           ##
## Plot the implied eviction prevalence  ##
## for the random forest                 ##
###########################################

rf_fit %>%
  mutate(`Respondent ID` = ifelse(model == "withID",
                                  "included as a predictor",
                                  "not included")) %>%
  ggplot(aes(x = candidate, y = estimate,
             color = `Respondent ID`)) +
  geom_line() + 
  geom_point() +
  xlab("Number of variables sampled for\npossible splitting at each node") +
  ylab("Implied proportion evicted\nby age 15") +
  ggtitle("B. Implied eviction prevalence") +
  theme(legend.position = "bottom") +
  ggsave("Output/rfCVEstimate.pdf",
         height = 3.5, width = 5)

sink("Output/rf_text_results.txt")
print("Candidate column indicates the mtry value")
print("Model column indicates whether respondent ID was included as a predictor")
print("Cross-validated MSE")
print(rf_cv %>%
        select(-fold) %>%
        melt(id = c("candidate","y","weight"),
             variable.name = "model",
             value.name = "phat") %>%
        group_by(candidate, model) %>%
        summarize(mse.weighted = weighted.mean((y - phat) ^ 2,
                                               w = weight),
                  mse.unweighted = mean((y - phat) ^ 2)) %>%
        arrange(model, candidate)
)
print("Implied prevalence from full fit")
print(rf_fit)
sink()

